
library(dplyr)
library(tidyr)
library(nlme)
library(ggplot2)
load('mysession.RData') ## generated when running models.R
figPath = '~/Dropbox/Apps/Overleaf/Python Package Download Trends/figures/'

## our two key models are from the vulnerability data and the ai data.

### start with vuln data, with our best model model_full_vulNOAI

vul_resid <- vDF %>%
  # level=1 means take the residuals after accounting
  # for the grouping effects (i.e. after random effects)
  mutate(l1_resid = resid(model_full_vulNOAI, level=1),
         # also include a column for standardized residuals
         l1_resid_std = l1_resid/sd(l1_resid))

head(vul_resid)


## l1 QQ

v.qq <- ggplot(data=vul_resid, aes(sample=l1_resid_std)) +
  stat_qq() +
  theme_bw() +
  xlab("Theoretical normal quantiles") +
  ylab("Standardized level-1 residuals") +
  ggtitle("Q-Q plot of standardized level-1 residuals") +
  # add an identity line y=x to compare equality of distributions
  geom_abline(intercept=0, slope=1)


pdf(paste0(figPath, 'V.L1.QQ.pdf'))
v.qq
dev.off()


## homoskedasticity

v.qq.bysub <- ggplot(data=vul_resid, aes(x=package, y=l1_resid_std)) +
  geom_point() +
  # horizontal reference line
  geom_hline(yintercept=0) +
  theme_bw() +
  xlab("Package") +
  ylab("Standardized level-1 residuals") +
  ggtitle("Level-1 residuals by package")

pdf(paste0(figPath, 'V.L1.QQ.bysub.pdf'))
v.qq.bysub
dev.off()

## let's look at l2

vul_resid_2 <- data.frame(package=unique(vul_resid$package), rand_int = ranef(model_full_vulNOAI)[,"(Intercept)"])
r2sd <- sd(vul_resid_2$rand_int)

vul_resid_2$rand_int_std <- vul_resid_2$rand_int/r2sd



### Normality

##Checking normality of random effects with quantile-quantile plots:

v.qq.l2 <- ggplot(data=vul_resid_2, aes(sample=rand_int_std)) +
  stat_qq() +
  theme_bw() +
  # facet by effect type to separate out the two
  xlab("Theoretical normal quantiles") +
  ylab("Standardized random effects") +
  ggtitle("Q-Q plot of standardized random effects") +
  # add an identity line y=x to compare equality of distributions
  geom_abline(intercept=0, slope=1)

pdf(paste0(figPath, 'V.L2.QQ.pdf'))
v.qq.l2
dev.off()


### Homoscedasticity
# plot residuals by package
v.qq.l2.bypkg <- ggplot(data=vul_resid_2, aes(x=package, y=rand_int_std)) +
  geom_point() +
  # horizontal reference line
  geom_hline(yintercept=0) +
  theme_bw() +
  xlab("Package ID") +
  ylab("Standardized level-2 residuals") +
  ggtitle("Level-2 residuals by package")

pdf(paste0(figPath, 'V.L2.QQ.bypkg.pdf'))
v.qq.l2.bypkg
dev.off()

### now let's do AI data

ai_resid <- fullDF %>%
  # level=1 means take the residuals after accounting
  # for the grouping effects (i.e. after random effects)
  mutate(l1_resid = resid(model_5_full, level=1),
         # also include a column for standardized residuals
         l1_resid_std = l1_resid/sd(l1_resid))

head(ai_resid)


## l1 QQ

### these crash!
ai.qq.l1 <- ggplot(data=ai_resid, aes(sample=l1_resid_std)) +
  stat_qq() +
  theme_bw() +
  xlab("Theoretical normal quantiles") +
  ylab("Standardized level-1 residuals") +
  ggtitle("Q-Q plot of standardized level-1 residuals") +
  # add an identity line y=x to compare equality of distributions
  geom_abline(intercept=0, slope=1)

pdf(paste0(figPath, 'AI.L1.QQ.pdf'))
ai.qq.l1
dev.off()

## homoskedasticity

ai.qq.l1.bypkg <- ggplot(data=ai_resid, aes(x=package, y=l1_resid_std)) +
  geom_point() +
  # horizontal reference line
  geom_hline(yintercept=0) +
  theme_bw() +
  xlab("Package") +
  ylab("Standardized level-1 residuals") +
  ggtitle("Level-1 residuals by package")

pdf(paste0(figPath, 'AI.L1.QQ.bypkg.pdf'))
ai.qq.l1.bypkg
dev.off()

## let's look at l2

ai_resid_2 <- data.frame(package=unique(ai_resid$package), rand_int = ranef(model_5_full)[,"(Intercept)"])
r2sd <- sd(ai_resid_2$rand_int)

ai_resid_2$rand_int_std <- ai_resid_2$rand_int/r2sd



### Normality

##Checking normality of random effects with quantile-quantile plots:

ai.qq.l2 <- ggplot(data=ai_resid_2, aes(sample=rand_int_std)) +
  stat_qq() +
  theme_bw() +
  # facet by effect type to separate out the two
  xlab("Theoretical normal quantiles") +
  ylab("Standardized random effects") +
  ggtitle("Q-Q plot of standardized random effects") +
  # add an identity line y=x to compare equality of distributions
  geom_abline(intercept=0, slope=1)

pdf(paste0(figPath, 'AI.L2.QQ.pdf'))
ai.qq.l2
dev.off()


### Homoscedasticity
# plot residuals by pkg
ai.qq.l2.bypkg <- ggplot(data=ai_resid_2, aes(x=package, y=rand_int_std)) +
  geom_point() +
  # horizontal reference line
  geom_hline(yintercept=0) +
  theme_bw() +
  xlab("Package") +
  ylab("Standardized level-2 residuals") +
  ggtitle("Level-2 residuals by package")

ai.qq.l2.bypkg

pdf(paste0(figPath, 'AI.L2.QQ.byPKG.pdf'))
ai.qq.l2.bypkg
dev.off()
